In [1]:
import pandas as pd
import datetime
import numpy as np
The production data from these years follows the same file format.
We can therefore import using the same format and put the dataframes into a dictionary.
In 1990 we manually fix well API No: 21451, DUCKETT "A" and set it's well number to 1 as unspecified. Same in 1991.
In [2]:
dates_cols_oil = ["OIL."+str(i) for i in range(0, 12, 1)]
dates_cols_gas = ["GAS."+str(i) for i in range(0, 12, 1)]
dates_cols = dates_cols_oil + dates_cols_gas
headers_old_2003 = ['API_COUNTY', 'API_NUMBER', 'SUFFIX', 'WELL_NAME','WELL_NO', ' OPER_NO', 'OPER_SUFFIX',
'OPERATOR', 'ME', 'SECTION', 'TWP','RAN', 'Q4', 'Q3', 'Q2', 'Q1', 'LATITUDE', 'LONGITUDE', 'OTC_COUNTY',
'OTC_LEASE_NO', 'OTC_SUB_NO', 'OTC_MERGE', 'POOL_NO', 'CODE','FORMATION', 'OFB', 'ALLOWABLE_CLASS',
'ALLOWABLE_TYPE', ' PURCH_NO',
'PURCHASER', 'PURCH_SUFFIX', 'OFB.1',
'YEAR', 'JAN', 'OIL.0', 'GAS.0', 'FEB',
'OIL.1', 'GAS.1', 'MAR', 'OIL.2', 'GAS.2',
'APR', 'OIL.3', 'GAS.3', 'MAY', 'OIL.4',
'GAS.4', 'JUN', 'OIL.5', 'GAS.5', 'JUL',
'OIL.6', 'GAS.6', 'AUG', 'OIL.7', 'GAS.7',
'SEP', 'OIL.8', 'GAS.8', 'OCT', 'OIL.9',
'GAS.9', 'NOV', 'OIL.10', 'GAS.10', 'DEC',
'OIL.11', 'GAS.11']
headers_new_2004 = ['API_COUNTY', 'API_NUMBER', 'S', 'WELL_NAME','WELL_NO', ' OPER_NO',
'OPERATOR', 'ME', 'SECTION', 'TWP','RAN', 'Q4', 'Q3', 'Q2', 'Q1', 'LATITUDE', 'LONGITUDE', 'OTC_COUNTY',
'OTC_LEASE_NO', 'OTC_SUB_NO', 'OTC_MERGE', 'POOL_NO', 'CODE','FORMATION','ALLOWABLE_CLASS',
'ALLOWABLE_TYPE', ' PURCH_NO',
'PURCHASER', 'OFB.1',
'YEAR', 'JAN', 'OIL.0', 'GAS.0', 'FEB',
'OIL.1', 'GAS.1', 'MAR', 'OIL.2', 'GAS.2',
'APR', 'OIL.3', 'GAS.3', 'MAY', 'OIL.4',
'GAS.4', 'JUN', 'OIL.5', 'GAS.5', 'JUL',
'OIL.6', 'GAS.6', 'AUG', 'OIL.7', 'GAS.7',
'SEP', 'OIL.8', 'GAS.8', 'OCT', 'OIL.9',
'GAS.9', 'NOV', 'OIL.10', 'GAS.10', 'DEC',
'OIL.11', 'GAS.11']
df_in = None
production_data = {}
for i in range(1987, 2016, 1):
dates_oil = [ "OIL_"+str(datetime.date(i, j+1, 1)) for j in range(0, 12, 1)]
dates_gas = [ "GAS_"+str(datetime.date(i, j+1, 1)) for j in range(0, 12, 1)]
renamed_oil = {old: new for old, new in zip(dates_cols_oil, dates_oil)}
renamed_gas = {old: new for old, new in zip(dates_cols_gas, dates_gas)}
renamed_cols = {**renamed_oil, **renamed_gas}
if i != 1994: #No Data from 1994
print(i)
if i <= 2008:
df = None
if i < 2004:
df = pd.read_csv("../raw/"+str(i)+"prodn.txt", delimiter="|", skiprows=[0, 2], names=headers_old_2003)
else:
df = pd.read_csv("../raw/"+str(i)+"prodn.txt", delimiter="|", skiprows=[0, 2], names=headers_new_2004)
df_in = df.copy()
df.rename(index=str, columns=renamed_cols, inplace=True)
df = df.drop(['YEAR','JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL','AUG', 'SEP', 'OCT', 'NOV', 'DEC'], axis=1)
production_data[i] = df
else:
df = pd.read_csv("../raw/"+str(i)+"prodn.txt", delimiter="|")
df[["API_COUNTY", "API_NUMBER"]].apply(lambda x: pd.to_numeric(x, errors='coerce',downcast='integer'))
df_in = df.copy()
df.rename(renamed_cols)
production_data[i] = df
df_in.head()
Out[2]:
In [3]:
def filter_data(row):
buffer = []
for val in row:
val_parsed = None
try:
val_parsed = int(val)
except ValueError:
val_parsed = 0
buffer.append(val_parsed)
return np.array(buffer, dtype=np.int32)
In [4]:
gas_dataframe = None
gas_prod_dfs = []
for year in range(1987, 2016):
print(year)
if year != 1994:
gas_data = {}
filter_col = [col for col in list(production_data[year]) if col.startswith('GAS')]
yearly_prod_data = production_data[year]
for i in range(1, len(yearly_prod_data.index)):
row = yearly_prod_data.iloc[[i]]
api_num = row["API_NUMBER"].values.astype(np.int32)[0]
pdata = filter_data(row[filter_col].values[0])
gas_data[api_num] = pdata
months = pd.date_range(start=str(year)+'-01-01', periods=12, freq='M')
gas_dataframe = pd.DataFrame.from_dict(gas_data)
gas_dataframe = gas_dataframe.set_index(months)
gas_prod_dfs.append(gas_dataframe)
gas_dataframe.head()
Out[4]:
In [5]:
gas_all_df = pd.concat(gas_prod_dfs).fillna(0)
In [6]:
gas_all_df.to_hdf('../processed/gas/gas_data.h5','table', mode="w")
In [7]:
num_elements = []
zero_elements = 0
for column in gas_all_df:
col = gas_all_df[column]
try:
num = col[(col!=0) | (col != 0.0)]
num_elements.append(len(num))
except ValueError:
zero_elements+=1
In [8]:
print(np.mean(num_elements), zero_elements)
In [9]:
print(np.max(num_elements))
In [32]:
print(np.where( np.array(num_elements) >= 100 )[0].shape)
In [10]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(1,1)
ax.hist(num_elements, bins=20)
Out[10]:
In [21]:
def get_col_length(df, length):
for column in df:
col = df[column]
try:
num = col[(col!=0) | (col != 0.0)]
if len(num) is length:
return column, col
except:
pass
In [34]:
api_num, get_col = get_col_length(gas_all_df, 150)
#num = get_col[(get_col!=0) | (get_col != 0.0)]
fig, ax = plt.subplots(1,1)
get_col.plot(ax=ax)
Out[34]:
In [62]:
indices = np.where(np.array(num_elements) >= 310 )
print (indices)
print(gas_all_df.columns[indices])
greater_two_hundred = gas_all_df[gas_all_df.columns[indices]]
new_data = []
for row in greater_two_hundred.values.T:
if max(row)>0:
new_data.append([item*1./max(row) for item in row])
else:
new_data.append(row)
new_data = np.transpose(new_data)
new_data = pd.DataFrame(new_data)
new_data.set_index(greater_two_hundred.index)
#new_data.plot(ax=ax, legend=False)
Out[62]:
In [69]:
fig, ax = plt.subplots(1,1, figsize=(12, 8))
new_data.plot(ax=ax, legend=False, colormap="Greys")
ax.set_xlabel(r"$Time$", fontsize=28)
ax.set_ylabel(r"$Normalized \ Gas \ Production$", fontsize=28)
fig.savefig("../img/production_max_values.png", dpi=300, layout="tight")
In [ ]: